//go:build !noasm && amd64
// AUTO-GENERATED BY GOAT -- DO NOT EDIT

// l2_byte_256 accumulates the sum of (a[i] - b[i])^2 over n bytes, with every
// byte zero-extended before the subtraction, and stores the low 32 bits of
// that sum at the address passed in res.
//
// Arguments ($0-32: no locals, 32 bytes of arguments, four 8-byte words):
//   a+0(FP)    -> DI  address of the first byte sequence
//   b+8(FP)    -> SI  address of the second byte sequence
//   res+16(FP) -> DX  address that receives the 32-bit result
//   len+24(FP) -> CX  address of the element count (it is dereferenced below,
//                     not used as a value)
//
// Only the low 32 bits of the loaded count n are examined when picking a path:
//   n >= 32       LBB0_1/LBB0_2: 32 bytes per iteration, then a byte-wise tail (LBB0_17)
//   16 <= n < 32  LBB0_10/LBB0_11: 16 bytes per iteration, then a byte-wise tail (LBB0_13)
//   0 < n < 16    LBB0_13 only
//   n <= 0        LBB0_7: the result is 0
//
// The body is emitted as raw opcode bytes; the trailing comment on each line
// is the disassembly in AT&T operand order (source first, destination last).
TEXT ·l2_byte_256(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ res+16(FP), DX
	MOVQ len+24(FP), CX
	// Prologue: save RBP and RBX (both restored in LBB0_18) and round RSP down
	// to a multiple of 8.
	BYTE $0x55               // pushq	%rbp
	WORD $0x8948; BYTE $0xe5 // movq	%rsp, %rbp
	BYTE $0x53               // pushq	%rbx
	LONG $0xf8e48348         // andq	$-8, %rsp
	// R10 = 64-bit value read through the len pointer; R8 = its low 32 bits,
	// zero-extended. From here on n means that 32-bit count.
	WORD $0x8b4c; BYTE $0x11 // movq	(%rcx), %r10
	WORD $0x8945; BYTE $0xd0 // movl	%r10d, %r8d
	LONG $0x20fa8341         // cmpl	$32, %r10d
	JGE  LBB0_1
	// n < 32 (signed): nothing to do when n <= 0.
	WORD $0x8545; BYTE $0xd2 // testl	%r10d, %r10d
	JLE  LBB0_7
	LONG $0x10f88341         // cmpl	$16, %r8d
	JAE  LBB0_10
	// 0 < n < 16: run the byte-wise loop from index 0 (R11) with a zero sum (R9).
	WORD $0x3145; BYTE $0xdb // xorl	%r11d, %r11d
	WORD $0x3145; BYTE $0xc9 // xorl	%r9d, %r9d
	JMP  LBB0_13

	// n >= 32. R9 = n sign-extended to 64 bits (loop bound), RBX = byte offset,
	// RAX = bytes not yet consumed, ymm0 = all zeroes (used as the high half when
	// widening bytes to words), ymm1 = eight 32-bit partial sums.
LBB0_1:
	WORD $0x634d; BYTE $0xc8 // movslq	%r8d, %r9
	LONG $0xc0eff9c5         // vpxor	%xmm0, %xmm0, %xmm0
	WORD $0xdb31             // xorl	%ebx, %ebx
	WORD $0x894c; BYTE $0xc0 // movq	%r8, %rax
	LONG $0xc9eff1c5         // vpxor	%xmm1, %xmm1, %xmm1

	// Main loop, 32 bytes of a and b per iteration: interleave each byte with a
	// zero byte to get unsigned 16-bit lanes, subtract, then vpmaddwd squares
	// the 16-bit differences and adds adjacent pairs into 32-bit lanes, which
	// are accumulated in ymm1.
LBB0_2:
	// RCX keeps the offset this iteration started at; it drives the exit test.
	WORD $0x8948; BYTE $0xd9       // movq	%rbx, %rcx
	LONG $0x146ffec5; BYTE $0x1f   // vmovdqu	(%rdi,%rbx), %ymm2
	LONG $0x1c6ffec5; BYTE $0x1e   // vmovdqu	(%rsi,%rbx), %ymm3
	LONG $0xe060edc5               // vpunpcklbw	%ymm0, %ymm2, %ymm4     # ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
	LONG $0xe860e5c5               // vpunpcklbw	%ymm0, %ymm3, %ymm5     # ymm5 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
	LONG $0xe5f9ddc5               // vpsubw	%ymm5, %ymm4, %ymm4
	LONG $0xd068edc5               // vpunpckhbw	%ymm0, %ymm2, %ymm2     # ymm2 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
	LONG $0xd868e5c5               // vpunpckhbw	%ymm0, %ymm3, %ymm3     # ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
	LONG $0xd3f9edc5               // vpsubw	%ymm3, %ymm2, %ymm2
	LONG $0xdcf5ddc5               // vpmaddwd	%ymm4, %ymm4, %ymm3
	LONG $0xc9fee5c5               // vpaddd	%ymm1, %ymm3, %ymm1
	LONG $0xd2f5edc5               // vpmaddwd	%ymm2, %ymm2, %ymm2
	LONG $0xcafef5c5               // vpaddd	%ymm2, %ymm1, %ymm1
	LONG $0x20c38348               // addq	$32, %rbx
	LONG $0xe0c08348               // addq	$-32, %rax
	// Continue while (start offset + 63) < n, i.e. while another full 32-byte
	// chunk fits after the one just consumed.
	LONG $0x3fc18348               // addq	$63, %rcx
	WORD $0x394c; BYTE $0xc9       // cmpq	%r9, %rcx
	JL   LBB0_2
	// Horizontal add of the eight 32-bit lanes of ymm1: fold the upper 128 bits
	// onto the lower, add the lane-reversed copy, then add lane 1 to lane 0.
	// The total ends up in lane 0 and is moved to R9D.
	LONG $0x397de3c4; WORD $0x01c8 // vextracti128	$1, %ymm1, %xmm0
	LONG $0xc1fef9c5               // vpaddd	%xmm1, %xmm0, %xmm0
	LONG $0xc870f9c5; BYTE $0x1b   // vpshufd	$27, %xmm0, %xmm1               # xmm1 = xmm0[3,2,1,0]
	LONG $0xc0fef1c5               // vpaddd	%xmm0, %xmm1, %xmm0
	LONG $0xc870f9c5; BYTE $0x55   // vpshufd	$85, %xmm0, %xmm1               # xmm1 = xmm0[1,1,1,1]
	LONG $0xc0fef1c5               // vpaddd	%xmm0, %xmm1, %xmm0
	LONG $0x7e79c1c4; BYTE $0xc1   // vmovd	%xmm0, %r9d
	// Done if the offset has reached n; otherwise R10 = bytes still to process.
	WORD $0x3944; BYTE $0xd3       // cmpl	%r10d, %ebx
	JGE  LBB0_18
	WORD $0x894d; BYTE $0xc2       // movq	%r8, %r10
	WORD $0x2949; BYTE $0xda       // subq	%rbx, %r10
	// NOTE(review): the loop above only exits once at most 31 bytes remain, so
	// this branch to the wide remainder loop (LBB0_14) looks unreachable -- confirm.
	LONG $0x20fa8349               // cmpq	$32, %r10
	JAE  LBB0_14
	// Fewer than 32 bytes left: finish byte-by-byte starting at offset RBX.
	WORD $0x8948; BYTE $0xd8       // movq	%rbx, %rax
	JMP  LBB0_17

	// n <= 0: the result is zero.
LBB0_7:
	WORD $0x3145; BYTE $0xc9 // xorl	%r9d, %r9d
	JMP  LBB0_18

	// 16 <= n < 32. R10 = n & 15 (bytes left for the scalar tail),
	// R11 = n & -16 (bytes covered by the vector loop), RAX = byte offset,
	// ymm0..ymm3 = zeroed 64-bit-lane accumulators.
LBB0_10:
	WORD $0x8945; BYTE $0xc2 // movl	%r8d, %r10d
	LONG $0x0fe28341         // andl	$15, %r10d
	WORD $0x8945; BYTE $0xc3 // movl	%r8d, %r11d
	LONG $0xf0e38341         // andl	$-16, %r11d
	LONG $0xc0eff9c5         // vpxor	%xmm0, %xmm0, %xmm0
	WORD $0xc031             // xorl	%eax, %eax
	LONG $0xc9eff1c5         // vpxor	%xmm1, %xmm1, %xmm1
	LONG $0xd2efe9c5         // vpxor	%xmm2, %xmm2, %xmm2
	LONG $0xdbefe1c5         // vpxor	%xmm3, %xmm3, %xmm3

	// 16 bytes per iteration as four groups of 4 bytes (offsets 0, 4, 8, 12),
	// each zero-extended to four 64-bit lanes. vpmuldq multiplies the signed low
	// 32 bits of each lane, giving the squared difference as a 64-bit value.
LBB0_11:
	LONG $0x327de2c4; WORD $0x0724             // vpmovzxbq	(%rdi,%rax), %ymm4      # ymm4 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
	LONG $0x327de2c4; WORD $0x076c; BYTE $0x04 // vpmovzxbq	4(%rdi,%rax), %ymm5     # ymm5 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
	LONG $0x327de2c4; WORD $0x0774; BYTE $0x08 // vpmovzxbq	8(%rdi,%rax), %ymm6     # ymm6 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
	LONG $0x327de2c4; WORD $0x077c; BYTE $0x0c // vpmovzxbq	12(%rdi,%rax), %ymm7    # ymm7 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
	LONG $0x327d62c4; WORD $0x0604             // vpmovzxbq	(%rsi,%rax), %ymm8      # ymm8 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
	LONG $0xfb5dc1c4; BYTE $0xe0               // vpsubq	%ymm8, %ymm4, %ymm4
	LONG $0x327d62c4; WORD $0x0644; BYTE $0x04 // vpmovzxbq	4(%rsi,%rax), %ymm8     # ymm8 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
	LONG $0xfb55c1c4; BYTE $0xe8               // vpsubq	%ymm8, %ymm5, %ymm5
	LONG $0x327d62c4; WORD $0x0644; BYTE $0x08 // vpmovzxbq	8(%rsi,%rax), %ymm8     # ymm8 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
	LONG $0x327d62c4; WORD $0x064c; BYTE $0x0c // vpmovzxbq	12(%rsi,%rax), %ymm9    # ymm9 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
	LONG $0xfb4dc1c4; BYTE $0xf0               // vpsubq	%ymm8, %ymm6, %ymm6
	LONG $0xfb45c1c4; BYTE $0xf9               // vpsubq	%ymm9, %ymm7, %ymm7
	LONG $0x285de2c4; BYTE $0xe4               // vpmuldq	%ymm4, %ymm4, %ymm4
	LONG $0xc0d4ddc5                           // vpaddq	%ymm0, %ymm4, %ymm0
	LONG $0x2855e2c4; BYTE $0xe5               // vpmuldq	%ymm5, %ymm5, %ymm4
	LONG $0xc9d4ddc5                           // vpaddq	%ymm1, %ymm4, %ymm1
	LONG $0x284de2c4; BYTE $0xe6               // vpmuldq	%ymm6, %ymm6, %ymm4
	LONG $0xd2d4ddc5                           // vpaddq	%ymm2, %ymm4, %ymm2
	LONG $0x2845e2c4; BYTE $0xe7               // vpmuldq	%ymm7, %ymm7, %ymm4
	LONG $0xdbd4ddc5                           // vpaddq	%ymm3, %ymm4, %ymm3
	LONG $0x10c08348                           // addq	$16, %rax
	WORD $0x3949; BYTE $0xc3                   // cmpq	%rax, %r11
	JNE  LBB0_11
	// Merge the four accumulators and sum their 64-bit lanes into R9.
	LONG $0xc0d4f5c5                           // vpaddq	%ymm0, %ymm1, %ymm0
	LONG $0xc0d4edc5                           // vpaddq	%ymm0, %ymm2, %ymm0
	LONG $0xc0d4e5c5                           // vpaddq	%ymm0, %ymm3, %ymm0
	LONG $0x397de3c4; WORD $0x01c1             // vextracti128	$1, %ymm0, %xmm1
	LONG $0xc1d4f9c5                           // vpaddq	%xmm1, %xmm0, %xmm0
	LONG $0xc870f9c5; BYTE $0xee               // vpshufd	$238, %xmm0, %xmm1              # xmm1 = xmm0[2,3,2,3]
	LONG $0xc1d4f9c5                           // vpaddq	%xmm1, %xmm0, %xmm0
	LONG $0x7ef9c1c4; BYTE $0xc1               // vmovq	%xmm0, %r9
	// No scalar tail when n is a multiple of 16.
	WORD $0x854d; BYTE $0xd2                   // testq	%r10, %r10
	JE   LBB0_18

	// Byte-wise loop for the n < 32 paths: R11 = index, R8 = n, R9 = running sum
	// (64-bit arithmetic here; only the low 32 bits are stored at LBB0_18).
LBB0_13:
	LONG $0x04b60f42; BYTE $0x1f // movzbl	(%rdi,%r11), %eax
	LONG $0x0cb60f42; BYTE $0x1e // movzbl	(%rsi,%r11), %ecx
	WORD $0x2948; BYTE $0xc8     // subq	%rcx, %rax
	LONG $0xc0af0f48             // imulq	%rax, %rax
	WORD $0x0149; BYTE $0xc1     // addq	%rax, %r9
	LONG $0x01c38349             // addq	$1, %r11
	WORD $0x394d; BYTE $0xd8     // cmpq	%r11, %r8
	JNE  LBB0_13
	JMP  LBB0_18

	// Wide remainder loop for >= 32 leftover bytes after the main loop.
	// R11 = leftover rounded down to a multiple of 32 (also copied to R9 as the
	// loop counter), RAX = offset at which the scalar tail will resume,
	// RBX is biased by +24 to match the -24/-16/-8/0 displacements below, and
	// the running sum from R9D is seeded into lane 0 of xmm0.
LBB0_14:
	WORD $0x894d; BYTE $0xd3     // movq	%r10, %r11
	LONG $0xe0e38349             // andq	$-32, %r11
	LONG $0xe0e08348             // andq	$-32, %rax
	WORD $0x0148; BYTE $0xd8     // addq	%rbx, %rax
	LONG $0x18c38348             // addq	$24, %rbx
	LONG $0x6e79c1c4; BYTE $0xc1 // vmovd	%r9d, %xmm0
	LONG $0xc9eff1c5             // vpxor	%xmm1, %xmm1, %xmm1
	WORD $0x894d; BYTE $0xd9     // movq	%r11, %r9
	LONG $0xd2efe9c5             // vpxor	%xmm2, %xmm2, %xmm2
	LONG $0xdbefe1c5             // vpxor	%xmm3, %xmm3, %xmm3

	// 32 bytes per iteration as four groups of 8 bytes, each zero-extended to
	// eight 32-bit lanes, subtracted, squared (vpmulld) and accumulated in
	// ymm0..ymm3.
LBB0_15:
	LONG $0x317de2c4; WORD $0x1f64; BYTE $0xe8 // vpmovzxbd	-24(%rdi,%rbx), %ymm4   # ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	LONG $0x317de2c4; WORD $0x1f6c; BYTE $0xf0 // vpmovzxbd	-16(%rdi,%rbx), %ymm5   # ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	LONG $0x317de2c4; WORD $0x1f74; BYTE $0xf8 // vpmovzxbd	-8(%rdi,%rbx), %ymm6    # ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	LONG $0x317de2c4; WORD $0x1f3c             // vpmovzxbd	(%rdi,%rbx), %ymm7      # ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	LONG $0x317d62c4; WORD $0x1e44; BYTE $0xe8 // vpmovzxbd	-24(%rsi,%rbx), %ymm8   # ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	LONG $0xfa5dc1c4; BYTE $0xe0               // vpsubd	%ymm8, %ymm4, %ymm4
	LONG $0x317d62c4; WORD $0x1e44; BYTE $0xf0 // vpmovzxbd	-16(%rsi,%rbx), %ymm8   # ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	LONG $0xfa55c1c4; BYTE $0xe8               // vpsubd	%ymm8, %ymm5, %ymm5
	LONG $0x317d62c4; WORD $0x1e44; BYTE $0xf8 // vpmovzxbd	-8(%rsi,%rbx), %ymm8    # ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	LONG $0x317d62c4; WORD $0x1e0c             // vpmovzxbd	(%rsi,%rbx), %ymm9      # ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
	LONG $0xfa4dc1c4; BYTE $0xf0               // vpsubd	%ymm8, %ymm6, %ymm6
	LONG $0xfa45c1c4; BYTE $0xf9               // vpsubd	%ymm9, %ymm7, %ymm7
	LONG $0x405de2c4; BYTE $0xe4               // vpmulld	%ymm4, %ymm4, %ymm4
	LONG $0xc0feddc5                           // vpaddd	%ymm0, %ymm4, %ymm0
	LONG $0x4055e2c4; BYTE $0xe5               // vpmulld	%ymm5, %ymm5, %ymm4
	LONG $0xc9feddc5                           // vpaddd	%ymm1, %ymm4, %ymm1
	LONG $0x404de2c4; BYTE $0xe6               // vpmulld	%ymm6, %ymm6, %ymm4
	LONG $0xd2feddc5                           // vpaddd	%ymm2, %ymm4, %ymm2
	LONG $0x4045e2c4; BYTE $0xe7               // vpmulld	%ymm7, %ymm7, %ymm4
	LONG $0xdbfeddc5                           // vpaddd	%ymm3, %ymm4, %ymm3
	LONG $0x20c38348                           // addq	$32, %rbx
	// The flags of this add feed the JNE: the loop ends when R9 reaches zero.
	LONG $0xe0c18349                           // addq	$-32, %r9
	JNE  LBB0_15
	// Merge the four accumulators and sum their 32-bit lanes into R9D.
	LONG $0xc0fef5c5                           // vpaddd	%ymm0, %ymm1, %ymm0
	LONG $0xc0feedc5                           // vpaddd	%ymm0, %ymm2, %ymm0
	LONG $0xc0fee5c5                           // vpaddd	%ymm0, %ymm3, %ymm0
	LONG $0x397de3c4; WORD $0x01c1             // vextracti128	$1, %ymm0, %xmm1
	LONG $0xc1fef9c5                           // vpaddd	%xmm1, %xmm0, %xmm0
	LONG $0xc870f9c5; BYTE $0xee               // vpshufd	$238, %xmm0, %xmm1              # xmm1 = xmm0[2,3,2,3]
	LONG $0xc1fef9c5                           // vpaddd	%xmm1, %xmm0, %xmm0
	LONG $0xc870f9c5; BYTE $0x55               // vpshufd	$85, %xmm0, %xmm1               # xmm1 = xmm0[1,1,1,1]
	LONG $0xc1fef9c5                           // vpaddd	%xmm1, %xmm0, %xmm0
	LONG $0x7e79c1c4; BYTE $0xc1               // vmovd	%xmm0, %r9d
	// Skip the scalar tail when the leftover was an exact multiple of 32.
	WORD $0x394d; BYTE $0xda                   // cmpq	%r11, %r10
	JE   LBB0_18

	// Byte-wise tail for the n >= 32 path: RAX = index, R8 = n, R9D = running
	// 32-bit sum. RBX is reused as scratch here; the caller's value is restored
	// by the pop in LBB0_18.
LBB0_17:
	LONG $0x070cb60f         // movzbl	(%rdi,%rax), %ecx
	LONG $0x061cb60f         // movzbl	(%rsi,%rax), %ebx
	WORD $0xd929             // subl	%ebx, %ecx
	WORD $0xaf0f; BYTE $0xc9 // imull	%ecx, %ecx
	WORD $0x0141; BYTE $0xc9 // addl	%ecx, %r9d
	LONG $0x01c08348         // addq	$1, %rax
	WORD $0x3949; BYTE $0xc0 // cmpq	%rax, %r8
	JNE  LBB0_17

	// Common exit: store the low 32 bits of the sum through res, undo the
	// prologue (RSP is recomputed from RBP, then RBX and RBP are popped), and
	// clear the upper halves of the ymm registers before returning.
LBB0_18:
	WORD $0x8944; BYTE $0x0a // movl	%r9d, (%rdx)
	LONG $0xf8658d48         // leaq	-8(%rbp), %rsp
	BYTE $0x5b               // popq	%rbx
	BYTE $0x5d               // popq	%rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3               // retq
